Exploratory Data Analysis (EDA)

I. Library Import

In [229]:
import pandas as pd
import numpy as np
import plotly.offline as py
import plotly
plotly.offline.init_notebook_mode()
plotly.tools.set_credentials_file(username='stfox13', api_key='Y4QUZyKzXSz7t8P209HF')
from plotly import *
from plotly.graph_objs import *
pd.options.display.float_format = '{0:.2f}'.format
import plotly.graph_objs as go
import cufflinks as cf
%matplotlib inline
mapbox_access_token='pk.eyJ1Ijoic3Rmb3gxMyIsImEiOiJjajJnejd6M2cwMDVmMnlsZ3I4aGI5bmwzIn0.03n_RH-9d3mCBRx9TCNaLw'

II. Data Dictionary

Column Name Definition Data Type
id record identifier string
date Date house was sold datetime
price Price is prediction target numeric
bedrooms Number of Bedrooms/House numeric
bathrooms Number of bathrooms/House numeric
sqft_living square footage of the home numeric
sqft_lot square footage of the lot numeric
floors Total floors (levels) in house numeric
waterfront House which has a view to a waterfront numeric
view Rating of the property's view, 0–4 (originally listed as "Has been viewed") numeric
condition How good the condition is (Overall) numeric
grade The perceived state of the home numeric
sqft_above square footage of house apart from basement numeric
sqft_basement square footage of the basement numeric
yr_built Built Year numeric
yr_renovated Year when house was renovated numeric
zipcode Zipcode of the home in question string
lat Latitude coordinate numeric
long Longitude coordinate numeric
sqft_living15 Living room area in 2015 (implies some renovations); this might or might not have affected the lot size area numeric
sqft_lot15 Lot size area in 2015 (implies some renovations) numeric

Note: all of the other columns are dummy variables / calculated columns / classifications created for deeper analysis.

III. Dataset Import

In [235]:
# NOTE(review): absolute, machine-specific path. Point this at a project-relative
# data directory so the notebook runs on other machines.
KC_HOUSE_DATA_CSV = "/Users/sfox/Documents/GADataScience/Homework/FinalProject/Dataset2/kc_house_data.csv"

# Load the King County sales data with explicit dtypes.
# Builtin `str` and fixed-width NumPy types are used because the `np.str` and
# `np.float_` aliases were removed from NumPy and now raise AttributeError.
# `id` and `zipcode` are identifiers, so they are kept as strings.
df = pd.read_csv(
    KC_HOUSE_DATA_CSV,
    skipinitialspace=True,
    parse_dates=["date"],  # by name rather than by column position
    dtype={
        "id": str,
        "date": str,
        "price": np.int64,
        "bedrooms": np.int64,
        "bathrooms": np.float64,
        "sqft_living": np.int64,
        "sqft_lot": np.int64,
        "floors": np.float64,
        "waterfront": np.int64,
        "view": np.int64,
        "condition": np.int64,
        "grade": np.int64,
        "sqft_above": np.int64,
        "sqft_basement": np.int64,
        "yr_built": np.int64,
        "yr_renovated": np.int64,
        "zipcode": str,
        "lat": np.float64,
        "long": np.float64,
        "sqft_living15": np.float64,
        "sqft_lot15": np.float64,
    },
)

# Derived columns used by the visualizations below.
# Decade label as a string, e.g. 1955 -> '1950'.
df['decade_built'] = (df['yr_built'] // 10 * 10).astype(str)
# yr_renovated == 0 is treated as "never renovated".
df['renovated'] = np.where(df['yr_renovated'] == 0, 0, 1)
df['year'] = df['date'].dt.year
# Explicit float casts keep these true divisions regardless of Python version.
df['price_sqft_living'] = df['price'].astype('float') / df['sqft_living'].astype('float')
df['price_sqft_lot'] = df['price'].astype('float') / df['sqft_lot'].astype('float')

IV. Initial Dataset Review

In [236]:
# Preview the first ten rows to sanity-check parsed types and derived columns.
df.iloc[:10]
Out[236]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... zipcode lat long sqft_living15 sqft_lot15 decade_built renovated year price_sqft_living price_sqft_lot
0 7129300520 2014-10-13 221900 3 1.00 1180 5650 1.00 0 0 ... 98178 47.51 -122.26 1340.00 5650.00 1950 0 2014 188.05 39.27
1 6414100192 2014-12-09 538000 3 2.25 2570 7242 2.00 0 0 ... 98125 47.72 -122.32 1690.00 7639.00 1950 1 2014 209.34 74.29
2 5631500400 2015-02-25 180000 2 1.00 770 10000 1.00 0 0 ... 98028 47.74 -122.23 2720.00 8062.00 1930 0 2015 233.77 18.00
3 2487200875 2014-12-09 604000 4 3.00 1960 5000 1.00 0 0 ... 98136 47.52 -122.39 1360.00 5000.00 1960 0 2014 308.16 120.80
4 1954400510 2015-02-18 510000 3 2.00 1680 8080 1.00 0 0 ... 98074 47.62 -122.05 1800.00 7503.00 1980 0 2015 303.57 63.12
5 7237550310 2014-05-12 1225000 4 4.50 5420 101930 1.00 0 0 ... 98053 47.66 -122.00 4760.00 101930.00 2000 0 2014 226.01 12.02
6 1321400060 2014-06-27 257500 3 2.25 1715 6819 2.00 0 0 ... 98003 47.31 -122.33 2238.00 6819.00 1990 0 2014 150.15 37.76
7 2008000270 2015-01-15 291850 3 1.50 1060 9711 1.00 0 0 ... 98198 47.41 -122.31 1650.00 9711.00 1960 0 2015 275.33 30.05
8 2414600126 2015-04-15 229500 3 1.00 1780 7470 1.00 0 0 ... 98146 47.51 -122.34 1780.00 8113.00 1960 0 2015 128.93 30.72
9 3793500160 2015-03-12 323000 3 2.50 1890 6560 2.00 0 0 ... 98038 47.37 -122.03 2390.00 7570.00 2000 0 2015 170.90 49.24

10 rows × 26 columns

In [237]:
# Pairwise Pearson correlations between the numeric columns only.
# The frame also holds string columns (id, zipcode, decade_built) and a
# datetime column; recent pandas raises if they are passed to corr(), so the
# numeric subset is selected explicitly (works on old and new pandas alike).
df.select_dtypes(include=[np.number]).corr()
Out[237]:
price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade ... yr_built yr_renovated lat long sqft_living15 sqft_lot15 renovated year price_sqft_living price_sqft_lot
price 1.00 0.31 0.53 0.70 0.09 0.26 0.27 0.40 0.04 0.67 ... 0.05 0.13 0.31 0.02 0.59 0.08 0.13 0.00 0.55 0.31
bedrooms 0.31 1.00 0.52 0.58 0.03 0.18 -0.01 0.08 0.03 0.36 ... 0.15 0.02 -0.01 0.13 0.39 0.03 0.02 -0.01 -0.21 -0.06
bathrooms 0.53 0.52 1.00 0.75 0.09 0.50 0.06 0.19 -0.12 0.66 ... 0.51 0.05 0.02 0.22 0.57 0.09 0.05 -0.03 -0.09 0.20
sqft_living 0.70 0.58 0.75 1.00 0.17 0.35 0.10 0.28 -0.06 0.76 ... 0.32 0.06 0.05 0.24 0.76 0.18 0.06 -0.03 -0.09 0.02
sqft_lot 0.09 0.03 0.09 0.17 1.00 -0.01 0.02 0.07 -0.01 0.11 ... 0.05 0.01 -0.09 0.23 0.14 0.72 0.01 0.01 -0.03 -0.21
floors 0.26 0.18 0.50 0.35 -0.01 1.00 0.02 0.03 -0.26 0.46 ... 0.49 0.01 0.05 0.13 0.28 -0.01 0.01 -0.02 0.00 0.48
waterfront 0.27 -0.01 0.06 0.10 0.02 0.02 1.00 0.40 0.02 0.08 ... -0.03 0.09 -0.01 -0.04 0.09 0.03 0.09 -0.00 0.19 0.03
view 0.40 0.08 0.19 0.28 0.07 0.03 0.40 1.00 0.05 0.25 ... -0.05 0.10 0.01 -0.08 0.28 0.07 0.10 0.00 0.22 0.08
condition 0.04 0.03 -0.12 -0.06 -0.01 -0.26 0.02 0.05 1.00 -0.14 ... -0.36 -0.06 -0.01 -0.11 -0.09 -0.00 -0.06 -0.05 0.10 -0.09
grade 0.67 0.36 0.66 0.76 0.11 0.46 0.08 0.25 -0.14 1.00 ... 0.45 0.01 0.11 0.20 0.71 0.12 0.01 -0.03 0.12 0.22
sqft_above 0.61 0.48 0.69 0.88 0.18 0.52 0.07 0.17 -0.16 0.76 ... 0.42 0.02 -0.00 0.34 0.73 0.19 0.02 -0.02 -0.09 -0.00
sqft_basement 0.32 0.30 0.28 0.44 0.02 -0.25 0.08 0.28 0.17 0.17 ... -0.13 0.07 0.11 -0.14 0.20 0.02 0.07 -0.02 -0.03 0.06
yr_built 0.05 0.15 0.51 0.32 0.05 0.49 -0.03 -0.05 -0.36 0.45 ... 1.00 -0.22 -0.15 0.41 0.33 0.07 -0.23 0.00 -0.29 0.12
yr_renovated 0.13 0.02 0.05 0.06 0.01 0.01 0.09 0.10 -0.06 0.01 ... -0.22 1.00 0.03 -0.07 -0.00 0.01 1.00 -0.02 0.11 0.04
lat 0.31 -0.01 0.02 0.05 -0.09 0.05 -0.01 0.01 -0.01 0.11 ... -0.15 0.03 1.00 -0.14 0.05 -0.09 0.03 -0.03 0.47 0.29
long 0.02 0.13 0.22 0.24 0.23 0.13 -0.04 -0.08 -0.11 0.20 ... 0.41 -0.07 -0.14 1.00 0.33 0.25 -0.07 0.00 -0.24 -0.27
sqft_living15 0.59 0.39 0.57 0.76 0.14 0.28 0.09 0.28 -0.09 0.71 ... 0.33 -0.00 0.05 0.33 1.00 0.18 -0.00 -0.02 0.04 -0.04
sqft_lot15 0.08 0.03 0.09 0.18 0.72 -0.01 0.03 0.07 -0.00 0.12 ... 0.07 0.01 -0.09 0.25 0.18 1.00 0.01 -0.00 -0.06 -0.23
renovated 0.13 0.02 0.05 0.06 0.01 0.01 0.09 0.10 -0.06 0.01 ... -0.23 1.00 0.03 -0.07 -0.00 0.01 1.00 -0.02 0.11 0.04
year 0.00 -0.01 -0.03 -0.03 0.01 -0.02 -0.00 0.00 -0.05 -0.03 ... 0.00 -0.02 -0.03 0.00 -0.02 -0.00 -0.02 1.00 0.05 0.01
price_sqft_living 0.55 -0.21 -0.09 -0.09 -0.03 0.00 0.19 0.22 0.10 0.12 ... -0.29 0.11 0.47 -0.24 0.04 -0.06 0.11 0.05 1.00 0.45
price_sqft_lot 0.31 -0.06 0.20 0.02 -0.21 0.48 0.03 0.08 -0.09 0.22 ... 0.12 0.04 0.29 -0.27 -0.04 -0.23 0.04 0.01 0.45 1.00

22 rows × 22 columns

In [238]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe(percentiles=[0.25, 0.5, 0.75])
Out[238]:
price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade ... yr_built yr_renovated lat long sqft_living15 sqft_lot15 renovated year price_sqft_living price_sqft_lot
count 21613.00 21613.00 21613.00 21613.00 21613.00 21613.00 21613.00 21613.00 21613.00 21613.00 ... 21613.00 21613.00 21613.00 21613.00 21613.00 21613.00 21613.00 21613.00 21613.00 21613.00
mean 540088.14 3.37 2.11 2079.90 15106.97 1.49 0.01 0.23 3.41 7.66 ... 1971.01 84.40 47.56 -122.21 1986.55 12768.46 0.04 2014.32 264.16 88.97
std 367127.20 0.93 0.77 918.44 41420.51 0.54 0.09 0.77 0.65 1.18 ... 29.37 401.68 0.14 0.14 685.39 27304.18 0.20 0.47 110.06 94.06
min 75000.00 0.00 0.00 290.00 520.00 1.00 0.00 0.00 1.00 1.00 ... 1900.00 0.00 47.16 -122.52 399.00 651.00 0.00 2014.00 87.59 0.16
25% 321950.00 3.00 1.75 1427.00 5040.00 1.00 0.00 0.00 3.00 7.00 ... 1951.00 0.00 47.47 -122.33 1490.00 5100.00 0.00 2014.00 182.29 33.33
50% 450000.00 3.00 2.25 1910.00 7618.00 1.50 0.00 0.00 3.00 7.00 ... 1975.00 0.00 47.57 -122.23 1840.00 7620.00 0.00 2014.00 244.64 59.22
75% 645000.00 4.00 2.50 2550.00 10688.00 2.00 0.00 0.00 4.00 8.00 ... 1997.00 0.00 47.68 -122.12 2360.00 10083.00 0.00 2015.00 318.32 107.59
max 7700000.00 33.00 8.00 13540.00 1651359.00 3.50 1.00 4.00 5.00 13.00 ... 2015.00 2015.00 47.78 -121.31 6210.00 871200.00 1.00 2015.00 810.14 2027.21

8 rows × 22 columns

V. Dataset Creation for Visualizations

In [239]:
# Mean and standard deviation of sale price for each transaction date.
priceStats = df.groupby('date').agg({'price': ['mean', 'std']}).reset_index()
daily_mean = priceStats['price']['mean']
daily_std = priceStats['price']['std']

# The three traces form a mean line with a shaded +/- 1 std band.
# Order matters: 'tonexty' fills towards the trace listed before it in `data`.
lower_bound = go.Scatter(
    name='Lower Bound',
    x=priceStats['date'],
    y=daily_mean - daily_std,
    marker=dict(color='#444'),  # was "444", which is not a valid colour string
    line=dict(width=0),
    mode='lines')

trace = go.Scatter(
    name='Mean',
    x=priceStats['date'],
    y=daily_mean,
    mode='lines',
    line=dict(color='rgb(31, 119, 180)'),
    fillcolor='rgba(68, 68, 68, 0.3)',
    fill='tonexty')

upper_bound = go.Scatter(
    name='Upper Bound',
    x=priceStats['date'],
    y=daily_mean + daily_std,
    mode='lines',
    marker=dict(color='#444'),
    line=dict(width=0),
    fillcolor='rgba(68, 68, 68, 0.3)',
    fill='tonexty')

data = [lower_bound, trace, upper_bound]

layout = go.Layout(
    # tickmode='linear' + dtick replaces the deprecated autotick=False.
    yaxis=dict(tickmode='linear', dtick=250000, title='Home Sale Price'),
    xaxis=dict(title='Transaction Date'),
    title='Average Home Sale Price, King County (WA) <br> Duration: 5/2014 - 5/2015',
    showlegend=False)
fig0 = go.Figure(data=data, layout=layout)
In [240]:
# Columns needed by the per-decade scatter plots.
priceStatsByDecade = df[['decade_built', 'zipcode', 'price', 'sqft_lot', 'sqft_living']]


def _rows_built_in(decade):
    """Return the rows of priceStatsByDecade whose decade_built equals `decade`."""
    is_decade = priceStatsByDecade['decade_built'] == decade
    return priceStatsByDecade[is_decade]


# One frame per construction decade; the names are consumed by the cells below.
priceStatsByDecade_1900 = _rows_built_in('1900')
priceStatsByDecade_1910 = _rows_built_in('1910')
priceStatsByDecade_1920 = _rows_built_in('1920')
priceStatsByDecade_1930 = _rows_built_in('1930')
priceStatsByDecade_1940 = _rows_built_in('1940')
priceStatsByDecade_1950 = _rows_built_in('1950')
priceStatsByDecade_1960 = _rows_built_in('1960')
priceStatsByDecade_1970 = _rows_built_in('1970')
priceStatsByDecade_1980 = _rows_built_in('1980')
priceStatsByDecade_1990 = _rows_built_in('1990')
priceStatsByDecade_2000 = _rows_built_in('2000')
priceStatsByDecade_2010 = _rows_built_in('2010')
In [241]:
# Scatter of living space vs. sale price, one trace per construction decade.
# Each entry: (legend label, rows for that decade, marker colour).
decade_series = [
    ('1900', priceStatsByDecade_1900, 'rgb(171, 171, 171)'),
    ('1910', priceStatsByDecade_1910, 'rgb(174, 117, 117)'),
    ('1920', priceStatsByDecade_1920, 'rgb(179, 0, 0)'),
    ('1930', priceStatsByDecade_1930, 'rgb(56, 0, 250)'),
    ('1940', priceStatsByDecade_1940, 'rgb(52, 148, 52)'),
    ('1950', priceStatsByDecade_1950, 'rgb(39, 111, 111)'),
    ('1960', priceStatsByDecade_1960, 'rgb(220, 219, 66)'),
    ('1970', priceStatsByDecade_1970, 'rgb(229, 173, 0)'),
    ('1980', priceStatsByDecade_1980, 'rgb(229, 88, 0)'),
    ('1990', priceStatsByDecade_1990, 'rgb(189, 171, 171)'),
    ('2000', priceStatsByDecade_2000, 'rgb(81, 0, 74)'),
    ('2010', priceStatsByDecade_2010, 'rgb(0, 111, 111)'),
]

# The traces differ only in label, data and colour, so build them in one pass
# instead of repeating the same go.Scatter call twelve times.
data = [
    go.Scatter(
        x=frame['sqft_living'],
        y=frame['price'],
        name=label,
        mode='markers',
        marker=dict(size=5, color=color, line=dict(width=1)),
    )
    for label, frame, color in decade_series
]

layout = dict(
    # tickmode='linear' + dtick replaces the deprecated autotick=False.
    yaxis=dict(tickmode='linear', dtick=250000, title='Home Sale Price'),
    xaxis=dict(tickmode='linear', dtick=1000, title='Living Space (Sqft)'),
    title='Correlation of Living Space (Sqft) to Home Sale Price, King County (WA) <br> Color-Coded By Decade',
)

fig1 = dict(data=data, layout=layout)
In [242]:
# Scatter of lot size vs. sale price, one trace per construction decade.
# Each entry: (legend label, rows for that decade, marker colour).
decade_series = [
    ('1900', priceStatsByDecade_1900, 'rgb(171, 171, 171)'),
    ('1910', priceStatsByDecade_1910, 'rgb(174, 117, 117)'),
    ('1920', priceStatsByDecade_1920, 'rgb(179, 0, 0)'),
    ('1930', priceStatsByDecade_1930, 'rgb(56, 0, 250)'),
    ('1940', priceStatsByDecade_1940, 'rgb(52, 148, 52)'),
    ('1950', priceStatsByDecade_1950, 'rgb(39, 111, 111)'),
    ('1960', priceStatsByDecade_1960, 'rgb(220, 219, 66)'),
    ('1970', priceStatsByDecade_1970, 'rgb(229, 173, 0)'),
    ('1980', priceStatsByDecade_1980, 'rgb(229, 88, 0)'),
    ('1990', priceStatsByDecade_1990, 'rgb(189, 171, 171)'),
    ('2000', priceStatsByDecade_2000, 'rgb(81, 0, 74)'),
    ('2010', priceStatsByDecade_2010, 'rgb(0, 111, 111)'),
]

# The traces differ only in label, data and colour, so build them in one pass
# instead of repeating the same go.Scatter call twelve times.
data = [
    go.Scatter(
        x=frame['sqft_lot'],
        y=frame['price'],
        name=label,
        mode='markers',
        marker=dict(size=5, color=color, line=dict(width=1)),
    )
    for label, frame, color in decade_series
]

layout = dict(
    # tickmode='linear' + dtick replaces the deprecated autotick=False.
    yaxis=dict(range=[0, 8000000], tickmode='linear', dtick=250000, title='Home Sale Price'),
    xaxis=dict(title='Lot Space (Sqft)'),
    title='Correlation of Lot Space (Sqft) to Home Sale Price, King County (WA) <br> Color-Coded By Decade',
)

fig2 = dict(data=data, layout=layout)
In [243]:
# Mean price per square foot (living area and lot) for each construction decade.
sqftPriceByDecade = df.groupby('decade_built').agg({'price_sqft_living':'mean','price_sqft_lot':'mean'}).reset_index()

# Dot plot: one marker per decade for each of the two price-per-sqft measures.
trace0 = go.Scatter(
    x=sqftPriceByDecade['price_sqft_living'],
    y=sqftPriceByDecade['decade_built'],
    mode='markers',
    name='Mean Price per Sqft (living)',
    marker=dict(
        color='rgb(81, 0, 74)',
        line=dict(
            color='rgba(156, 165, 196, 1.0)',
            width=1,
        ),
        symbol='circle',
        size=16,
    )
)
trace1 = go.Scatter(
    x=sqftPriceByDecade['price_sqft_lot'],
    y=sqftPriceByDecade['decade_built'],
    mode='markers',
    name='Mean Price per Sqft (lot)',
    marker=dict(
        color='rgb(0, 111, 111)',
        line=dict(
            color='rgba(217, 217, 217, 1.0)',
            width=1,
        ),
        symbol='circle',
        size=16,
    )
)
data = [trace0, trace1]
layout = dict(
    # tickmode='linear' + dtick replaces the deprecated autotick=False,
    # which current go.Figure validation rejects.
    yaxis=dict(range=[1890, 2020], tickmode='linear', dtick=10, ticks='outside', zeroline=False, title='Decade Built'),
    xaxis=dict(tickmode='linear', dtick=50, ticks='outside', zeroline=False, title='Mean Price per Sqft'),
    title='Comparison of Mean Price per Sqft (lot) Versus Mean Price per Sqft (living), King County (WA) <br> Color-Coded By Sqft Classification',
)

fig3 = go.Figure(data=data, layout=layout)
In [244]:
# Columns shared by the waterfront and renovation box plots.
homeImprovementSqft = df[['waterfront','renovated','view','price_sqft_lot','price_sqft_living']]

# Grouped box plot: price per sqft (living vs. lot), split by the waterfront flag.
trace0 = go.Box(
    y=homeImprovementSqft['price_sqft_living'],
    x=homeImprovementSqft['waterfront'],
    name='Price Per Sqft (living)',
    marker=dict(
        color='rgb(81, 0, 74)'
    )
)

trace1 = go.Box(
    y=homeImprovementSqft['price_sqft_lot'],
    x=homeImprovementSqft['waterfront'],
    name='Price Per Sqft (lot)',
    marker=dict(
        color='rgb(0, 111, 111)'
    )
)

data = [trace0, trace1]

layout = go.Layout(
    xaxis=dict(
        title='Is Waterfront Property<br>(0 = False, 1 = True)'
    ),
    yaxis=dict(
        range=[0, 2100],
        # tickmode='linear' + dtick replaces the deprecated autotick=False.
        tickmode='linear', dtick=100, ticks='outside',
        title='Price Per Sqft'
    ),
    boxmode='group',
    title='Box Plot Comparison of Waterfront versus Non-Waterfront Properties, King County (WA) <br> Color-Coded By Sqft Classification',
)
fig4 = go.Figure(data=data, layout=layout)
In [245]:
# Grouped box plot: price per sqft (living vs. lot), split by the renovated flag.
# Reuses homeImprovementSqft from the waterfront cell.
trace0 = go.Box(
    y=homeImprovementSqft['price_sqft_living'],
    x=homeImprovementSqft['renovated'],
    name='Price Per Sqft (living)',
    marker=dict(
        color='rgb(81, 0, 74)'
    )
)

trace1 = go.Box(
    y=homeImprovementSqft['price_sqft_lot'],
    x=homeImprovementSqft['renovated'],
    name='Price Per Sqft (lot)',
    marker=dict(
        color='rgb(0, 111, 111)'
    )
)

data = [trace0, trace1]

layout = go.Layout(
    xaxis=dict(
        title='Is Renovated Property<br>(0 = False, 1 = True)'
    ),
    yaxis=dict(
        range=[0, 2200],
        # tickmode='linear' + dtick replaces the deprecated autotick=False.
        tickmode='linear', dtick=100, ticks='outside',
        title='Price Per Sqft'
    ),
    boxmode='group',
    title='Box Plot Comparison of Renovated Properties versus Non-Renovated Properties, King County (WA) <br> Color-Coded By Sqft Classification',
)
fig5 = go.Figure(data=data, layout=layout)
In [246]:
# Raw-count histogram of the bathrooms column.
trace0 = go.Histogram(x=df['bathrooms'], marker={'color': 'rgb(0, 111, 111)'})
data = [trace0]
layout = go.Layout(
    title='Distribution of Bathrooms, King County (WA)',
    xaxis={'title': 'Number of Bathrooms'},
)
fig6 = go.Figure(data=data, layout=layout)
In [247]:
# Same bathrooms histogram, with bar heights normalized to probabilities.
trace0 = go.Histogram(
    x=df['bathrooms'],
    histnorm='probability',
    marker={'color': 'rgb(0, 111, 111)'},
)
data = [trace0]
layout = go.Layout(
    title='Normalized Distribution of Bathrooms, King County (WA)',
    xaxis={'title': 'Number of Bathrooms'},
)
fig7 = go.Figure(data=data, layout=layout)
In [248]:
# Raw-count histogram of the bedrooms column.
trace0 = go.Histogram(x=df['bedrooms'], marker={'color': 'rgb(81, 0, 74)'})
data = [trace0]
layout = go.Layout(
    title='Distribution of Bedrooms, King County (WA)',
    xaxis={'title': 'Number of Bedrooms'},
)
fig8 = go.Figure(data=data, layout=layout)
In [249]:
# Same bedrooms histogram, with bar heights normalized to probabilities.
trace0 = go.Histogram(
    x=df['bedrooms'],
    histnorm='probability',
    marker={'color': 'rgb(81, 0, 74)'},
)
data = [trace0]
layout = go.Layout(
    title='Normalized Distribution of Bedrooms, King County (WA)',
    xaxis={'title': 'Number of Bedrooms'},
)
fig9 = go.Figure(data=data, layout=layout)
In [250]:
# One map marker per sale; hovering a marker shows the sale price.
lats = df['lat'].tolist()
lons = df['long'].tolist()
prices = df['price'].tolist()

# A plain list and a dict replace the deprecated Data([...]) and Marker(...)
# wrappers, and the go.-qualified names avoid relying on the star imports.
data = [
    go.Scattermapbox(
        lat=lats,
        lon=lons,
        mode='markers',
        marker=dict(size=9),
        text=prices,
    )
]
layout = go.Layout(
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        # Initial map centre and zoom level.
        center=dict(
            lat=47.608013,
            lon=-122.335167
        ),
        pitch=0,
        zoom=7.9
    ),
    title='Home Sales by Location, King County (WA)<br>(Will be expanded for final presentation)'
)

fig10 = dict(data=data, layout=layout)

VI. Visualization Generation

In [215]:
# Render each figure built in section V inline, one figure per cell.
# NOTE(review): these cells carry lower execution counts (215-225) than the
# cells that build the figures (239-250), so the rendered output may predate
# the current definitions. Re-run with Restart Kernel -> Run All to confirm.
py.iplot(fig0)
In [216]:
py.iplot(fig1)
In [217]:
py.iplot(fig2)
In [218]:
py.iplot(fig3)
In [219]:
py.iplot(fig4)
In [220]:
py.iplot(fig5)
In [221]:
py.iplot(fig6)
In [222]:
py.iplot(fig7)
In [223]:
py.iplot(fig8)
In [224]:
py.iplot(fig9)
In [225]:
py.iplot(fig10)

VII. Next Steps
I requested and received GreatSchools API access. GreatSchools tracks standardized test scores and parent ratings for public and private schools across the United States. I plan to join the information extracted from the GreatSchools site via the API to the housing data and derive each home's proximity to good, average, and bad schools - classifications derived from the standardized test scores found within the data.
At this time, I am having a bit of trouble joining the two datasets - this will be the focus of my efforts moving forward.

In [ ]: